Import Dataset

# Load the HR data set. The garbled first column name (`ï..Emp_Id`) seen in
# the printed output is the usual symptom of a UTF-8 byte-order mark at the
# start of the file being read as part of the header; "UTF-8-BOM" strips it
# so the column comes through as `Emp_Id`. Later code selects columns by
# position, so it is unaffected by the corrected name.
data <- read.csv("HR_Employee_Data.csv", fileEncoding = "UTF-8-BOM")

Summary of the Data

# Preview the first six rows of the raw data.
head(data, n = 6L)
##   ï..Emp_Id satisfaction_level last_evaluation number_project
## 1  IND02438                38%             53%              2
## 2  IND28133                80%             86%              5
## 3  IND07164                11%             88%              7
## 4  IND30478                72%             87%              5
## 5  IND24003                37%             52%              2
## 6  IND08609                41%             50%              2
##   average_montly_hours time_spend_company Work_accident left
## 1                  157                  3             0    1
## 2                  262                  6             0    1
## 3                  272                  4             0    1
## 4                  223                  5             0    1
## 5                  159                  3             0    1
## 6                  153                  3             0    1
##   promotion_last_5years Department salary
## 1                     0      sales    low
## 2                     0      sales medium
## 3                     0      sales medium
## 4                     0      sales    low
## 5                     0      sales    low
## 6                     0      sales    low
# Compact overview of every column: its storage type and first few values.
str(data)
## 'data.frame':    14999 obs. of  11 variables:
##  $ ï..Emp_Id            : chr  "IND02438" "IND28133" "IND07164" "IND30478" ...
##  $ satisfaction_level   : chr  "38%" "80%" "11%" "72%" ...
##  $ last_evaluation      : chr  "53%" "86%" "88%" "87%" ...
##  $ number_project       : int  2 5 7 5 2 2 6 5 5 2 ...
##  $ average_montly_hours : int  157 262 272 223 159 153 247 259 224 142 ...
##  $ time_spend_company   : int  3 6 4 5 3 3 4 5 5 3 ...
##  $ Work_accident        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ left                 : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ promotion_last_5years: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Department           : chr  "sales" "sales" "sales" "sales" ...
##  $ salary               : chr  "low" "medium" "medium" "low" ...
library(dplyr) 
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# dplyr's column-wise preview: row/column counts, then one line per column.
glimpse(data)
## Rows: 14,999
## Columns: 11
## $ ï..Emp_Id             <chr> "IND02438", "IND28133", "IND07164", "IND30478", ~
## $ satisfaction_level    <chr> "38%", "80%", "11%", "72%", "37%", "41%", "10%",~
## $ last_evaluation       <chr> "53%", "86%", "88%", "87%", "52%", "50%", "77%",~
## $ number_project        <int> 2, 5, 7, 5, 2, 2, 6, 5, 5, 2, 2, 6, 4, 2, 2, 2, ~
## $ average_montly_hours  <int> 157, 262, 272, 223, 159, 153, 247, 259, 224, 142~
## $ time_spend_company    <int> 3, 6, 4, 5, 3, 3, 4, 5, 5, 3, 3, 4, 5, 3, 3, 3, ~
## $ Work_accident         <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ~
## $ left                  <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ~
## $ promotion_last_5years <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ~
## $ Department            <chr> "sales", "sales", "sales", "sales", "sales", "sa~
## $ salary                <chr> "low", "medium", "medium", "low", "low", "low", ~
# Per-column summary: quantiles and mean for numeric columns, length/class
# for character columns.
summary(data)
##   ï..Emp_Id         satisfaction_level last_evaluation    number_project 
##  Length:14999       Length:14999       Length:14999       Min.   :2.000  
##  Class :character   Class :character   Class :character   1st Qu.:3.000  
##  Mode  :character   Mode  :character   Mode  :character   Median :4.000  
##                                                           Mean   :3.803  
##                                                           3rd Qu.:5.000  
##                                                           Max.   :7.000  
##  average_montly_hours time_spend_company Work_accident         left       
##  Min.   : 96.0        Min.   : 2.000     Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:156.0        1st Qu.: 3.000     1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :200.0        Median : 3.000     Median :0.0000   Median :0.0000  
##  Mean   :201.1        Mean   : 3.498     Mean   :0.1446   Mean   :0.2381  
##  3rd Qu.:245.0        3rd Qu.: 4.000     3rd Qu.:0.0000   3rd Qu.:0.0000  
##  Max.   :310.0        Max.   :10.000     Max.   :1.0000   Max.   :1.0000  
##  promotion_last_5years  Department           salary         
##  Min.   :0.00000       Length:14999       Length:14999      
##  1st Qu.:0.00000       Class :character   Class :character  
##  Median :0.00000       Mode  :character   Mode  :character  
##  Mean   :0.02127                                            
##  3rd Qu.:0.00000                                            
##  Max.   :1.00000

Check for Missing (NA) Values

# Count the NA entries in each column and show the counts as a one-column
# table with one row per variable.
cbind(lapply(data, function(column) sum(is.na(column))))
##                       [,1]
## ï..Emp_Id             0   
## satisfaction_level    0   
## last_evaluation       0   
## number_project        0   
## average_montly_hours  0   
## time_spend_company    0   
## Work_accident         0   
## left                  0   
## promotion_last_5years 0   
## Department            0   
## salary                0
# Total number of NA cells across the whole data frame.
sum(is.na(data))
## [1] 0

Data Cleaning

# satisfaction_level is read as text such as "38%": remove the literal
# percent sign and store the value as an integer.
data$satisfaction_level <- as.integer(
  gsub("%", "", as.character(data$satisfaction_level), fixed = TRUE)
)
head(data, n = 6L)
##   ï..Emp_Id satisfaction_level last_evaluation number_project
## 1  IND02438                 38             53%              2
## 2  IND28133                 80             86%              5
## 3  IND07164                 11             88%              7
## 4  IND30478                 72             87%              5
## 5  IND24003                 37             52%              2
## 6  IND08609                 41             50%              2
##   average_montly_hours time_spend_company Work_accident left
## 1                  157                  3             0    1
## 2                  262                  6             0    1
## 3                  272                  4             0    1
## 4                  223                  5             0    1
## 5                  159                  3             0    1
## 6                  153                  3             0    1
##   promotion_last_5years Department salary
## 1                     0      sales    low
## 2                     0      sales medium
## 3                     0      sales medium
## 4                     0      sales    low
## 5                     0      sales    low
## 6                     0      sales    low
# last_evaluation is read as text such as "53%": remove the literal percent
# sign and store the value as an integer.
data$last_evaluation <- as.integer(
  gsub("%", "", as.character(data$last_evaluation), fixed = TRUE)
)
head(data, n = 6L)
##   ï..Emp_Id satisfaction_level last_evaluation number_project
## 1  IND02438                 38              53              2
## 2  IND28133                 80              86              5
## 3  IND07164                 11              88              7
## 4  IND30478                 72              87              5
## 5  IND24003                 37              52              2
## 6  IND08609                 41              50              2
##   average_montly_hours time_spend_company Work_accident left
## 1                  157                  3             0    1
## 2                  262                  6             0    1
## 3                  272                  4             0    1
## 4                  223                  5             0    1
## 5                  159                  3             0    1
## 6                  153                  3             0    1
##   promotion_last_5years Department salary
## 1                     0      sales    low
## 2                     0      sales medium
## 3                     0      sales medium
## 4                     0      sales    low
## 5                     0      sales    low
## 6                     0      sales    low

1) Correlation plot

#install.packages("remotes")
#remotes::install_github("kmaheshkulkarni/corrly")
library(plotly)
## Warning: package 'plotly' was built under R version 4.1.1
## Loading required package: ggplot2
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(corrly)
# Correlation heat map over columns 2 to 9 (satisfaction_level through
# promotion_last_5years). The first (id) column and the character columns
# Department and salary are left out.
matrixly(data[, 2:9])
## Warning in plotly::config(., displaylogo = FALSE, collaborate = FALSE): The
## collaborate button is no longer supported
## Warning: 'config' objects don't have these attributes: 'collaborate'
## Valid attributes include:
## 'autosizable', 'displaylogo', 'displayModeBar', 'doubleClick', 'doubleClickDelay', 'editable', 'edits', 'fillFrame', 'frameMargins', 'globalTransforms', 'linkText', 'locale', 'locales', 'logging', 'mapboxAccessToken', 'modeBarButtons', 'modeBarButtonsToAdd', 'modeBarButtonsToRemove', 'notifyOnLogging', 'plotGlPixelRatio', 'plotlyServerURL', 'queueLength', 'responsive', 'scrollZoom', 'sendData', 'setBackground', 'showAxisDragHandles', 'showAxisRangeEntryBoxes', 'showEditInChartStudio', 'showLink', 'showSendToCloud', 'showSources', 'showTips', 'staticPlot', 'toImageButtonOptions', 'topojsonURL', 'watermark'
## Warning: 'heatmap' objects don't have these attributes: 'marker'
## Valid attributes include:
## 'autocolorscale', 'coloraxis', 'colorbar', 'colorscale', 'connectgaps', 'customdata', 'customdatasrc', 'dx', 'dy', 'hoverinfo', 'hoverinfosrc', 'hoverlabel', 'hoverongaps', 'hovertemplate', 'hovertemplatesrc', 'hovertext', 'hovertextsrc', 'ids', 'idssrc', 'legendgroup', 'legendgrouptitle', 'legendrank', 'meta', 'metasrc', 'name', 'opacity', 'reversescale', 'showlegend', 'showscale', 'stream', 'text', 'textsrc', 'transforms', 'transpose', 'type', 'uid', 'uirevision', 'visible', 'x', 'x0', 'xaxis', 'xcalendar', 'xgap', 'xhoverformat', 'xperiod', 'xperiod0', 'xperiodalignment', 'xsrc', 'xtype', 'y', 'y0', 'yaxis', 'ycalendar', 'ygap', 'yhoverformat', 'yperiod', 'yperiod0', 'yperiodalignment', 'ysrc', 'ytype', 'z', 'zauto', 'zhoverformat', 'zmax', 'zmid', 'zmin', 'zsmooth', 'zsrc', 'key', 'set', 'frame', 'transforms', '_isNestedKey', '_isSimpleKey', '_isGraticule', '_bbox'

2) People who have left in each department

#install.packages('ecodist')
library(plotly)
library(dplyr)
library(ecodist)
## Warning: package 'ecodist' was built under R version 4.1.3
# Cross-tabulate department against the `left` flag; the plot reads the
# X0 / X1 columns of the result as the counts for left == 0 and left == 1.
ans <- crosstab(data$Department, data$left)
Department <- rownames(ans)

# Grouped bar chart: one pair of bars (stayed / left) per department.
fig <- plot_ly(
  ans,
  x = ~Department, y = ~X0,
  type = "bar", name = "Working for Company"
) %>%
  add_trace(y = ~X1, name = "Left the Company") %>%
  layout(
    title = "Employees who have left based on department",
    yaxis = list(title = "Count"),
    barmode = "group"
  )
fig

3) People who have left based on salary

#install.packages('ecodist')
library(plotly)
library(dplyr)
library(ecodist)
# Cross-tabulate salary band against the `left` flag; the plot reads the
# X0 / X1 columns of the result as the counts for left == 0 and left == 1.
ans <- crosstab(data$salary, data$left)
Salary <- rownames(ans)

# Grouped bar chart: one pair of bars (stayed / left) per salary band.
fig <- plot_ly(
  ans,
  x = ~Salary, y = ~X0,
  type = "bar", name = "Working for Company"
) %>%
  add_trace(y = ~X1, name = "Left the Company") %>%
  layout(
    title = "Employees who have left based on salary",
    yaxis = list(title = "Count"),
    barmode = "group"
  )
fig

4) Area plot of time spent in the company, comparing those who have and have not left

# Cross-tabulate time_spend_company against the `left` flag
# (X0 = still working, X1 = left).
ans <- crosstab(data$time_spend_company, data$left)
ans
##      X0   X1
## 2  3191   53
## 3  4857 1586
## 4  1667  890
## 5   640  833
## 6   509  209
## 7   188    0
## 8   162    0
## 10  214    0
# The row names carry the distinct time_spend_company values as strings.
Time_Spent <- rownames(ans)
Time_Spent
## [1] "2"  "3"  "4"  "5"  "6"  "7"  "8"  "10"
# Two filled line traces (stayed / left) drawn over the same x values.
fig <- plot_ly(
  ans,
  x = ~Time_Spent, y = ~X0,
  type = "scatter", mode = "lines", fill = "tozeroy",
  name = "Working for Company"
) %>%
  add_trace(y = ~X1, name = "Left the Company", fill = "tozeroy") %>%
  layout(
    xaxis = list(title = "Time Worked"),
    yaxis = list(title = "Count")
  )
fig

Naive Bayes

library(naivebayes)
## Warning: package 'naivebayes' was built under R version 4.1.3
## naivebayes 0.9.7 loaded
library(dplyr)
library(ggplot2)
library(psych)
## Warning: package 'psych' was built under R version 4.1.3
## 
## Attaching package: 'psych'
## The following object is masked from 'package:ecodist':
## 
##     distance
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
# Turn the 0/1 outcome into a two-level factor so the models below treat it
# as a class label rather than a number.
data$left <- factor(data$left)
#data$rank<-as.factor(data$rank)
str(data)
## 'data.frame':    14999 obs. of  11 variables:
##  $ ï..Emp_Id            : chr  "IND02438" "IND28133" "IND07164" "IND30478" ...
##  $ satisfaction_level   : int  38 80 11 72 37 41 10 92 89 42 ...
##  $ last_evaluation      : int  53 86 88 87 52 50 77 85 100 53 ...
##  $ number_project       : int  2 5 7 5 2 2 6 5 5 2 ...
##  $ average_montly_hours : int  157 262 272 223 159 153 247 259 224 142 ...
##  $ time_spend_company   : int  3 6 4 5 3 3 4 5 5 3 ...
##  $ Work_accident        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ left                 : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
##  $ promotion_last_5years: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Department           : chr  "sales" "sales" "sales" "sales" ...
##  $ salary               : chr  "low" "medium" "medium" "low" ...
# Keep columns 2 to 11 only; this drops the first column (the employee id),
# which is not used as a predictor.
dataNB <- data[2:11]

#pairs.panels(data)

# Reproducible random split: every row is independently labelled 1 (train,
# probability 0.8) or 2 (test, probability 0.2), so the split sizes are
# approximate rather than exact. `TRUE` is spelled out because `T` is an
# ordinary variable that can be reassigned.
set.seed(234)
smpl <- sample(2, nrow(dataNB), replace = TRUE, prob = c(0.8, 0.2))

train <- dataNB[smpl == 1, ]
test <- dataNB[smpl == 2, ]

# Fit a naive Bayes classifier for `left` using all remaining columns.
mdl <- naive_bayes(left ~ ., data = train)
#mdl
#mdl

NB Plot and Prediction

# Visualise the fitted naive Bayes model `mdl`.
plot(mdl)

# Predicted class probabilities for every training row (one column per level
# of `left`).
# NOTE(review): `train` still contains the response column `left`, which is
# presumably what triggers the "more features in the newdata" warning below;
# confirm, and consider passing only the predictor columns.
p<-predict(mdl,train,type='prob')
## Warning: predict.naive_bayes(): more features in the newdata are provided as
## there are probability tables in the object. Calculation is performed based on
## features to be found in the tables.
# Show the probabilities next to the first rows of the training data.
head(cbind(p,train))
##             0         1 satisfaction_level last_evaluation number_project
## 1 0.147803194 0.8521968                 38              53              2
## 2 0.734647017 0.2653530                 80              86              5
## 3 0.001099585 0.9989004                 11              88              7
## 4 0.486081254 0.5139187                 72              87              5
## 5 0.139957478 0.8600425                 37              52              2
## 6 0.154630169 0.8453698                 41              50              2
##   average_montly_hours time_spend_company Work_accident left
## 1                  157                  3             0    1
## 2                  262                  6             0    1
## 3                  272                  4             0    1
## 4                  223                  5             0    1
## 5                  159                  3             0    1
## 6                  153                  3             0    1
##   promotion_last_5years Department salary
## 1                     0      sales    low
## 2                     0      sales medium
## 3                     0      sales medium
## 4                     0      sales    low
## 5                     0      sales    low
## 6                     0      sales    low
# Predicted class labels for the training rows.
# NOTE(review): this evaluates the model on `train`, the same rows it was
# fitted on, so the accuracy below is a training accuracy. The held-out
# `test` split created earlier is never used for the naive Bayes model, and
# the figure is not directly comparable with an accuracy measured on `test`.
p1<-predict(mdl,train)
## Warning: predict.naive_bayes(): more features in the newdata are provided as
## there are probability tables in the object. Calculation is performed based on
## features to be found in the tables.
# Confusion matrix: rows are predicted labels, columns are actual labels.
(tab1<-table(p1,train$left))
##    
## p1     0    1
##   0 7369  739
##   1 1763 2127
# Share of rows on the diagonal, i.e. predictions that match the actual label.
accuracy=sum(diag(tab1))/sum(tab1)
accuracy
## [1] 0.7914652

Accuracy (measured on the training set): 0.7914652

Decision Tree Model

#install.packages("rpart.plot")
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.1.3
## Loading required package: rpart
# Reproducible random split of columns 2 to 11 (the first, id, column is
# dropped): every row is independently labelled 1 (train, probability 0.8)
# or 2 (test, probability 0.2). `TRUE` is spelled out because `T` is an
# ordinary variable that can be reassigned.
set.seed(234)
dataDT <- data[2:11]
smpl <- sample(2, nrow(dataDT), replace = TRUE, prob = c(0.8, 0.2))
train <- dataDT[smpl == 1, ]
test <- dataDT[smpl == 2, ]

# Fit a classification tree for `left` on all remaining columns and draw it.
fit <- rpart(left ~ ., data = train, method = "class")
rpart.plot(fit, extra = 106)

Prediction

# Predict class labels for the held-out rows with the fitted tree.
predict_unseen <- predict(fit, newdata = test, type = "class")

# Confusion matrix: rows are actual labels, columns are predicted labels.
table_mat <- table(test$left, predict_unseen)
table_mat
##    predict_unseen
##        0    1
##   0 2270   26
##   1   58  647
# Accuracy is the diagonal (correct predictions) over all test rows.
accuracy_Test <- sum(diag(table_mat)) / sum(table_mat)
accuracy_Test
## [1] 0.9720093

Accuracy (measured on the held-out test set): 0.9720093